# Code listing 3-10
import nltk
from nltk.corpus import PlaintextCorpusReader
corpus_root = './data'  # directory holding the locally stored screen-work texts
# Wrap every file under corpus_root into an NLTK plain-text corpus.
wordlists = PlaintextCorpusReader(corpus_root, '.*')
file_ids = wordlists.fileids()  # file identifiers the corpus discovered
print(file_ids)
# Code listing 3-11
# Read the novel text and report two basic vocabulary statistics.
# BUG FIX: the original evaluated both statistics as bare expressions,
# whose values are silently discarded when run as a script (they only
# display in an interactive REPL) — print them instead.
with open('./data/琅琊榜.txt', 'r', encoding='utf-8') as f:  # open the text
    fiction = f.read()  # whole novel as one string
unique_count = len(set(fiction))  # vocabulary size: number of distinct characters
print(unique_count)
if unique_count:  # guard against ZeroDivisionError on an empty file
    print(len(fiction) / unique_count)  # average uses per distinct character

import re
import jieba
# Keep only characters in the basic CJK Unified Ideographs range,
# stripping punctuation, Latin letters, digits and whitespace.
cleaned_data = re.sub('[^\u4e00-\u9fa5]', '', fiction)
jieba.load_userdict('./data/userdict.txt')  # domain-specific names for the segmenter
wordlist = jieba.lcut(cleaned_data)  # word segmentation
text = nltk.Text(wordlist)  # wrap the token list for NLTK text analysis
print(text)


import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei']  # font that can render Chinese glyphs
plt.rcParams['axes.unicode_minus'] = False  # SimHei lacks the Unicode minus glyph
# BUG FIX: the original word list held characters from a different novel
# (甄嬛传 / Empresses in the Palace), so none of them occur in the loaded
# 琅琊榜 text and the dispersion plot came out empty. Use main characters
# of 琅琊榜 (Nirvana in Fire) instead.
words = ['梅长苏', '霓凰', '靖王', '飞流']
# Plot where each character name appears across the token stream.
nltk.draw.dispersion.dispersion_plot(text, words)
plt.show()